# 数据处理完整思路
# 1:载入需要的数据。原始数据中除抑郁问卷外,还包括焦虑问卷,因此只载入测量抑郁的问卷,及人口学信息。具体为DSRSC、CDI、PHQ-9、和DASS-21中测量抑郁的题目。
# 2:转换数据为数值型。
# 3:处理缺失值(实际没有缺失值)。
# 4:原始数据-1,用以方便计算检出率。原始数据的范围与量表正常的数据范围是不一致的。例如:DSRSC量表的计分范围正常应该为0-2,检出率为≥15。但咱们收到的原始数据计分范围就是1-3,也就是所有数值都+了1。因此-1后才能计算正确的检出率。
# 5:描述统计,原始数据的年龄范围显示为0-29岁,因此参考李医生的建议,筛除了7岁以下和18岁以上的人。
# 计算了1、性别比例。2、年龄范围。
# 6:数据质量检查。首先,使用原始数据进行了相关分析,用以检查数据。
# 发现问题:1、DSRSC问卷内相关有正有负。2、CDI问卷内相关有正有负。因此,数据应该进行反向计分,与李医生讨论过确实需要进行反向计分。
# 7:反向计分,以及反向计分后的相关分析。
# 8: 检验问卷内部合并的及跨问卷测量同一个症状的相关系数是否大于其他的
# 9:检出率计算
# 载入必要的包
library(here)
## here() starts at D:/心理健康测量/MH_CPL/5_Analysis/5_2_Measurement data analysis
library(bruceR)
##
## bruceR (v2023.9)
## Broadly Useful Convenient and Efficient R functions
##
## Packages also loaded:
## ✔ data.table ✔ emmeans
## ✔ dplyr ✔ lmerTest
## ✔ tidyr ✔ effectsize
## ✔ stringr ✔ performance
## ✔ ggplot2 ✔ interactions
##
## Main functions of `bruceR`:
## cc() Describe() TTEST()
## add() Freq() MANOVA()
## .mean() Corr() EMMEANS()
## set.wd() Alpha() PROCESS()
## import() EFA() model_summary()
## print_table() CFA() lavaan_summary()
##
## For full functionality, please install all dependencies:
## install.packages("bruceR", dep=TRUE)
##
## Online documentation:
## https://psychbruce.github.io/bruceR
##
## To use this package in publications, please cite:
## Bao, H.-W.-S. (2023). bruceR: Broadly useful convenient and efficient R functions (Version 2023.9) [Computer software]. https://CRAN.R-project.org/package=bruceR
##
## NEWS: A new version of bruceR (2024.6) is available (2024-06-13)!
##
## ***** Please update *****
## install.packages("bruceR", dep=TRUE)
##
## These packages are dependencies of `bruceR` but not installed:
## - cowplot, ggtext, see, lmtest, vars, phia, BayesFactor, GPArotation
##
## ***** Install all dependencies *****
## install.packages("bruceR", dep=TRUE)
##
## 载入程序包:'bruceR'
## The following object is masked _by_ 'package:data.table':
##
## %notin%
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ data.table::between() masks dplyr::between()
## ✖ Matrix::expand() masks tidyr::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ data.table::first() masks dplyr::first()
## ✖ lubridate::hour() masks data.table::hour()
## ✖ lubridate::isoweek() masks data.table::isoweek()
## ✖ dplyr::lag() masks stats::lag()
## ✖ data.table::last() masks dplyr::last()
## ✖ lubridate::mday() masks data.table::mday()
## ✖ lubridate::minute() masks data.table::minute()
## ✖ lubridate::month() masks data.table::month()
## ✖ Matrix::pack() masks tidyr::pack()
## ✖ lubridate::quarter() masks data.table::quarter()
## ✖ lubridate::second() masks data.table::second()
## ✖ purrr::transpose() masks data.table::transpose()
## ✖ Matrix::unpack() masks tidyr::unpack()
## ✖ lubridate::wday() masks data.table::wday()
## ✖ lubridate::week() masks data.table::week()
## ✖ lubridate::yday() masks data.table::yday()
## ✖ lubridate::year() masks data.table::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(openxlsx)
## Warning: 程序包'openxlsx'是用R版本4.4.1 来建造的
library(ggcorrplot)
## Warning: 程序包'ggcorrplot'是用R版本4.4.1 来建造的
#1.载入数据
raw_data <- bruceR::import(here::here("data", "Rawdata1.xlsx"))
## New names:
## • `量表ID` -> `量表ID...6`
## • `量表名称` -> `量表名称...7`
## • `量表ID` -> `量表ID...49`
## • `量表ID` -> `量表ID...69`
## • `量表ID` -> `量表ID...80`
## • `量表名称` -> `量表名称...81`
## • `量表ID` -> `量表ID...89`
## • `量表ID` -> `量表ID...98`
#筛选与抑郁有关的数据
selected_data <- raw_data %>%
select(就诊卡号, 性别, 年龄,
DSRSC1:DSRSC18,
PHQ1:PHQ9,
DASS3, DASS5, DASS10, DASS13, DASS16, DASS17, DASS21,
CDI1:CDI27)
#2.转换数据类型
selected_data_numeric <- selected_data %>%
mutate(性别 = factor(性别, levels = c("男", "女"), labels = c(1, 2))) %>%
mutate(across(matches("性别|年龄|DSRSC|PHQ|DASS|CDI"), as.numeric))
#3.处理缺失值
clean_data <- selected_data_numeric %>%
drop_na()
#4.原始数据-1,用以方便计算检出率
transformed_data <- clean_data %>%
mutate(across(matches("DSRSC|PHQ|DASS|CDI"), ~ .x - 1))
#5.筛选出年龄在7岁以上且不超过18岁的数据
filtered_data <- transformed_data %>%
filter(年龄 >= 7 & 年龄 <= 18)
# 描述统计
# 计算性别比例
gender_proportion <- filtered_data %>%
summarise(
male = sum(性别 == 1, na.rm = TRUE),
female = sum(性别 == 2, na.rm = TRUE),
total = n(),
male_proportion = male / total * 100,
female_proportion = female / total * 100
)
# 打印性别比例
print(gender_proportion)
## male female total male_proportion female_proportion
## 1 6704 6060 12764 52.52272 47.47728
# 计算年龄的范围
age_range <- filtered_data %>%
summarise(
min_age = min(年龄, na.rm = TRUE),
max_age = max(年龄, na.rm = TRUE)
)
# 打印年龄范围
print(age_range)
## min_age max_age
## 1 7 18
# 检查并创建输出目录
output_dir <- here::here("output")
if (!dir.exists(output_dir)) {
dir.create(output_dir)
}
# 6.数据质量检查,反向计分前的相关分析
correlation_matrix <- filtered_data %>%
select(matches("DSRSC|PHQ|DASS|CDI")) %>%
cor()
# 导出
output_file <- here::here("output", "correlation_matrix.xlsx")
write.xlsx(correlation_matrix, output_file)
# 可视化
p <- ggcorrplot(correlation_matrix, lab = TRUE,
method = "circle",
outline.color = "white",
colors = c("blue", "white", "red"),
ggtheme = ggplot2::theme_minimal(base_family = "sans"),
title = "未反向计分的相关系数") +
theme_minimal(base_family = "sans") +
theme(
text = element_text(size = 20, color = "black"),
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, size = 14),
axis.text.y = element_text(size = 14),
axis.title = element_text(size = 22),
plot.background = element_rect(fill = "gray90"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_rect(fill = "gray90", color = NA),
legend.position = "bottom",
legend.key.width = unit(4, "cm"),
legend.key.height = unit(1, "cm")
) +
scale_size_continuous(range = c(5, 12)) +
labs(x = "", y = "")
## Scale for size is already present.
## Adding another scale for size, which will replace the existing scale.
# 保存图片
ggsave(here::here("output", "correlation_heatmap.png"),
plot = p, width = 32, height = 28, dpi = 300)